###############################################################################
# Setup and Libraries
###############################################################################

library(dplyr)
library(stringr)
library(countrycode)
library(readr)
library(tidyr)
library(purrr)


###############################################################################
# Function Definitions
###############################################################################

#' Standardize country names taken from affiliation strings
#'
#' @param x Character vector of country or location strings
#' @return Character vector the same length as `x`. Entries that match a known
#'   variant are replaced by the English country name; everything else
#'   (including `NA`) is returned unchanged.
#' @details Covers non-English spellings (e.g. "Brasil"), UK constituent
#'   countries and one UK postcode, and a handful of institution, city and
#'   misspelling cases. Pattern rules are checked in the order written and the
#'   first one that matches wins.
fix_country_names <- function(x) {
  uk_aliases <- c("England", "Wales", "Scotland", "Northern Ireland", "WC1H 9SH")

  # Whole-string replacements, written as variant = country. None of these
  # strings contains any of the patterns below, so they cannot compete with
  # the pattern rules and can be resolved through a single lookup.
  exact_fixes <- c(
    "Santander" = "Spain",
    "Ethopia" = "Ethiopia",
    "Accra" = "Ghana",
    "Elderly and Children" = "Tanzania",
    "4006" = "Australia"
  )
  exact_hit <- match(x, names(exact_fixes))

  case_when(
    # Non-English spellings
    str_detect(x, "Brasil") ~ "Brazil",
    str_detect(x, "Perú") ~ "Peru",
    str_detect(x, "México") ~ "Mexico",

    # UK constituent countries (and a stray postcode)
    x %in% uk_aliases ~ "United Kingdom",

    # Institutions or states that appear where a country was expected
    str_detect(x, "University of Michigan|Utah") ~ "United States",
    str_detect(x, "Ottawa Hospital") ~ "Canada",
    str_detect(x, "Harvard") ~ "United States",

    # Cities, misspellings and other whole-string cases
    !is.na(exact_hit) ~ unname(exact_fixes)[exact_hit],

    # Anything unrecognised passes through untouched
    TRUE ~ x
  )
}

#' Create dummy variables from comma-separated string columns
#'
#' @param df Data frame containing the column to process
#' @param column_name Name of the column containing comma-separated strings
#' @param prefix Prefix for new dummy variable names
#' @param min_mentions Minimum frequency threshold for creating individual dummy variables
#' @return List with two elements: `data`, the input data frame with the new
#'   logical dummy columns appended, and `value_counts`, a data frame of every
#'   distinct entry (`value`) and how often it occurs (`freq`), most frequent
#'   first
#' @details Each row is split on commas into trimmed, non-empty entries.
#'   Entries mentioned at least `min_mentions` times get their own column
#'   `<prefix>_<entry>` (lower-cased, whitespace replaced by `_`); all rarer
#'   entries share the column `<prefix>_other`. A dummy is `TRUE` only when the
#'   row contains that exact entry, so "Content Analysis" does not flag a row
#'   that only lists "Inductive Content Analysis". Rows that are `NA` or empty
#'   are `FALSE` in every dummy column.
create_dummy_vars <- function(df, column_name, prefix, min_mentions = 5) {
  if (!column_name %in% names(df)) {
    stop("Column '", column_name, "' not found in df", call. = FALSE)
  }

  # Split every row once; the same entries feed both the frequency table and
  # the dummies, so the two can never disagree about what counts as a mention.
  entries_by_row <- lapply(
    str_split(df[[column_name]], ",\\s*"),
    function(entries) {
      entries <- str_trim(entries)
      entries[!is.na(entries) & entries != ""]
    }
  )

  # Frequency of each distinct entry across all rows
  all_values <- as.character(unlist(entries_by_row))
  value_counts <- sort(table(all_values), decreasing = TRUE)
  value_df <- data.frame(
    value = as.character(names(value_counts)),
    freq = as.numeric(value_counts),
    row.names = NULL
  )

  frequent_values <- value_df$value[value_df$freq >= min_mentions]
  rare_values <- value_df$value[value_df$freq < min_mentions]

  # TRUE for rows that list at least one of `values` as a whole entry
  mentions_any <- function(values) {
    vapply(
      entries_by_row,
      function(entries) any(entries %in% values),
      logical(1)
    )
  }

  dummy_cols <- lapply(frequent_values, mentions_any)

  # Guarded because paste0() on a zero-length vector still returns one string,
  # which cannot be assigned as the names of an empty list.
  if (length(frequent_values) > 0) {
    names(dummy_cols) <- paste0(
      prefix, "_",
      str_replace_all(tolower(frequent_values), "\\s+", "_")
    )
  }

  # With no rare values this is simply FALSE for every row
  dummy_cols[[paste0(prefix, "_other")]] <- mentions_any(rare_values)

  # Bind all new columns at once
  df <- bind_cols(df, as_tibble(dummy_cols))

  list(
    data = df,
    value_counts = value_df
  )
}

#' Clean methodology entries
#'
#' @param x Character vector of methodology entries: either a single entry or
#'   the pieces obtained by splitting a comma-separated cell
#' @return For a single entry, the standardized name, or `NA_character_` when
#'   the entry is missing, blank or "?". For several entries, one string with
#'   the standardized names joined by ", "; invalid entries are dropped, and
#'   the result is "" when none remain. Zero-length input gives
#'   `NA_character_`.
#' @details Fixes misspellings, standardizes variants, and removes invalid
#'   entries. Matching is on the exact string, so callers should trim
#'   surrounding whitespace first.
clean_methodology <- function(x) {
  # Known misspellings and variants, written as variant = standard name
  corrections <- c(
    "Mixed Method" = "Mixed Methods",
    "MIxed Methods" = "Mixed Methods",
    "Phenomological Analysis" = "Phenomenological Analysis",
    "Phenomenlogical Analysis" = "Phenomenological Analysis",
    "Ground Theory Analysis" = "Grounded Theory Analysis",
    "Ground Theory" = "Grounded Theory Analysis",
    "Grounded Theory" = "Grounded Theory Analysis",
    "Ethnographic" = "Ethnography",
    "Conetent Analysis" = "Content Analysis",
    "Content Analyis" = "Content Analysis",
    "Comparative Analysis" = "Constant Comparative Analysis",
    "Inductive Content Analysis" = "Content Analysis",
    "Decriptive Analysis" = "Descriptive Analysis",
    "Theme Analysis" = "Thematic Analysis",
    "Modeling" = "Modelling"
  )

  cleaned <- as.character(x)

  hit <- match(cleaned, names(corrections))
  matched <- !is.na(hit)
  cleaned[matched] <- unname(corrections)[hit[matched]]

  # Missing, blank (including Unicode whitespace) and "?" entries are invalid
  invalid <- is.na(cleaned) |
    trimws(cleaned, whitespace = "[\\h\\v]") == "" |
    cleaned %in% "?"
  cleaned[invalid] <- NA_character_

  if (length(cleaned) > 1) {
    # Drop invalid entries before joining so they are not pasted as the
    # literal text "NA".
    return(paste(cleaned[!is.na(cleaned)], collapse = ", "))
  }
  if (length(cleaned) == 0) {
    return(NA_character_)
  }
  cleaned
}

#' Overwrite miscoded methodology and data type entries for specific articles
#'
#' @param df Data frame with `url`, `methodology` and `data_type` columns
#' @return `df` with `methodology` and `data_type` replaced for the listed
#'   articles; all other rows keep their existing values
#' @details Articles are identified by a numeric id that appears in their URL.
#'   Each id maps to the full corrected cell value. Should a URL contain more
#'   than one listed id, the id listed first wins.
clean_method_type_entries <- function(df) {
  # article id found in the URL = corrected cell value
  methodology_overrides <- c(
    "0187107" = "Content Analysis, Social Network Analysis",
    "0236545" = "Coding",
    "0261304" = "Thematic Analysis",
    "0260640" = "Thematic Analysis",
    "0217673" = "Thematic Analysis"
  )
  data_type_overrides <- c(
    "0187107" = "Archival Documents, Interviews, Focus Groups",
    "0236545" = "Archival Documents, Interviews",
    "0261304" = "Surveys, Interviews, Focus Groups",
    "0260640" = "Interviews",
    "0217673" = "Surveys",
    "0196811" = "Surveys",
    "0192973" = "Interviews",
    "0001947" = "Interviews"
  )

  # Returns `current` with the override substituted wherever `url` contains
  # one of the ids; rows already claimed by an earlier id are left alone.
  override_by_url <- function(url, current, overrides) {
    replacement <- rep(NA_character_, length(url))
    for (article_id in names(overrides)) {
      hit <- str_detect(url, article_id) & is.na(replacement)
      replacement[which(hit)] <- overrides[[article_id]]
    }
    coalesce(replacement, current)
  }

  df %>%
    mutate(
      methodology = override_by_url(url, methodology, methodology_overrides),
      data_type = override_by_url(url, data_type, data_type_overrides)
    )
}

#' Standardize data type entries
#'
#' @param x Character vector of data type entries
#' @return Character vector the same length as `x`: known variants and
#'   misspellings are replaced by their standard label, empty strings become
#'   `NA`, and anything else is returned unchanged
#' @details Matching is on the exact string; no trimming or case folding is
#'   done here.
clean_data_type <- function(x) {
  # standard label = variants and misspellings that should map to it
  variants_by_label <- list(
    "Interviews" = c("Interview", "Interivews", "Inteviews", "Interiviews"),
    "Focus Groups" = c(
      "Focus Group", "Foucs Groups", "Focus Gorups", "Focus Heoups",
      "Focus Groups (Brainstorming Sessions)"
    ),
    "Surveys" = c("Survey", "Surveys", "Suveys"),
    "Archival Documents" = c(
      "Archival", "Archival Materials",
      "Archival Documents (WhatsApp Messages)"
    ),
    "Quantitative Data (non-survey)" = c(
      "Quantitative (non-survey)",
      "Quantitative Analysis (non-survey)",
      "Quantitative Data (non-survey )",
      "Quantitative Data (Non-Survey)",
      "Quanitative Data (non-survey)",
      "Quantitative Data (non survey)",
      "Quantitative Date (non survey)"
    ),
    "Social Media/Blog Posts" = c(
      "Social Media", "Social media/blog posts", "Social Medi/Blog"
    ),
    "Web Sources (other)" = "Web Sources (Other)",
    "Photographs" = c("Photos", "Photography"),
    "Videos" = c("Video", "Videos"),
    "Audio Recordings" = "Audio-Recording",
    "Observations" = "Observation"
  )

  # Flatten into two parallel vectors so one match() resolves every variant
  variant <- unlist(variants_by_label, use.names = FALSE)
  label <- rep(names(variants_by_label), lengths(variants_by_label))
  hit <- match(x, variant)

  case_when(
    !is.na(hit) ~ label[hit],
    # Empty entries carry no information
    x == "" ~ NA_character_,
    # Unrecognised entries are kept as they are
    TRUE ~ x
  )
}


#' Extract the DOI from a PLOS article URL
#'
#' @description
#' Extracts the Digital Object Identifier (DOI) from a PLOS journal URL by
#' capturing the value of the `id` query parameter: the text after `id=` up to
#' the next `&`, a `#` fragment marker, or the end of the string.
#' @param url A character vector of PLOS journal URLs
#' @return A character vector the same length as `url` containing the
#'   extracted DOIs; `NA` where `url` is `NA` or has no `id` parameter
extract_doi <- function(url) {
  url <- as.character(url)

  # `id=` must start the string or follow `?`/`&`, so parameters that merely
  # end in "id" (e.g. `uuid=`) are not mistaken for the DOI. The value stops
  # at `#` as well as `&` so a trailing fragment is not swallowed.
  hits <- regexpr("(?<=^id=|[?&]id=)[^&#]+", url, perl = TRUE)

  doi <- rep(NA_character_, length(url))
  found <- !is.na(hits) & hits > 0
  # regmatches() returns only the successful matches, in order
  doi[found] <- regmatches(url, hits)
  doi
}


#' Add citation counts from OpenAlex API
#'
#' @description
#' Fetches citation counts for DOIs one at a time from the OpenAlex API, with
#' built-in rate limiting (a 2 second pause after every 10 requests) and error
#' reporting via `message()`.
#'
#' @param data A data frame or tibble containing DOIs
#' @param doi_col Name of the DOI column
#'
#' @return `data` with an added numeric `cited_by_count` column. The count is
#'   `NA` when the DOI is missing or empty, when the request or response
#'   parsing fails, or when OpenAlex returns no record for the DOI.
#' @details A failed lookup never aborts the run and is never recorded as 0,
#'   so a zero in the result always means OpenAlex reported zero citations.
#' @export
add_citation_counts <- function(data, doi_col = "doi") {
  if (!doi_col %in% names(data)) {
    stop("Column '", doi_col, "' not found in data", call. = FALSE)
  }

  dois <- as.character(data[[doi_col]])
  # Start from NA so that anything not successfully fetched stays missing
  counts <- rep(NA_real_, length(dois))
  requests_made <- 0L

  for (i in seq_along(dois)) {
    doi <- dois[i]
    if (is.na(doi) || !nzchar(doi)) {
      next
    }

    request_url <- paste0(
      "https://api.openalex.org/works?filter=doi:https://doi.org/",
      utils::URLencode(doi)
    )
    content <- NA_character_

    # The handler's return value becomes the count. Assigning to `counts`
    # inside the handler would only change a copy local to that function.
    counts[i] <- tryCatch({
      response <- httr::GET(request_url)
      content <- httr::content(response, "text", encoding = "UTF-8")
      # Turn HTTP errors (e.g. 429, 5xx) into R errors handled below
      httr::stop_for_status(response)

      result <- jsonlite::fromJSON(content)
      cited <- result$results$cited_by_count
      if (length(cited) == 0) {
        message("No OpenAlex record found for URL: ", request_url)
        NA_real_
      } else {
        as.numeric(cited[[1]])
      }
    }, error = function(e) {
      message("Error processing URL: ", request_url)
      if (!is.na(content)) {
        message("Response content: ", content)
      }
      message("Error message: ", conditionMessage(e))
      NA_real_
    })

    requests_made <- requests_made + 1L
    if (requests_made %% 10L == 0L) {
      Sys.sleep(2)
    }
  }

  data[["cited_by_count"]] <- counts
  data
}


#' Merge previously saved citation counts into a data frame
#'
#' @description
#' Use this when the citation counts have already been fetched and written to
#' a CSV file, so they can be joined in without calling the API again.
#'
#' @param data Data frame containing the DOI column
#' @param citation_file Path to a CSV file holding the DOI column and the
#'   saved citation columns
#' @param doi_col Name of the DOI column, present in both `data` and the file
#'
#' @return `data` with the columns of the citation file joined on by DOI. All
#'   rows of `data` are kept and none are duplicated.
#' @details If a DOI occurs more than once in the citation file (repeated
#'   missing DOIs included), only its first row is used and a warning is
#'   issued. Without this, every repeated key would multiply the matching rows
#'   of `data` in the join.
merge_citation_counts <- function(data, citation_file, doi_col = "doi") {
  citations <- readr::read_csv(citation_file)

  if (!doi_col %in% names(data) || !doi_col %in% names(citations)) {
    stop(
      "Column '", doi_col, "' must be present in both data and ", citation_file,
      call. = FALSE
    )
  }

  # Keep one row per DOI so the left join cannot fan out
  repeated <- duplicated(citations[[doi_col]])
  if (any(repeated)) {
    warning(
      "Dropping ", sum(repeated), " duplicate '", doi_col, "' row(s) from ",
      citation_file,
      call. = FALSE
    )
    citations <- citations[!repeated, , drop = FALSE]
  }

  data |>
    dplyr::left_join(citations, by = doi_col)
}


#' Merge the has_supplement column from a supplements dataframe to a source dataframe
#'
#' @description This function merges just the has_supplement column from a supplementary
#'   dataframe into the source dataframe using the URL column as the join key.
#'
#' @param df The source dataframe that will receive the has_supplement data
#' @param supplements The dataframe containing the has_supplement column to be merged
#'
#' @return A dataframe with all original columns from df plus the has_supplement column
#'
#' @details This function performs a left join using the 'url' column, so it preserves
#'   all rows from the original dataframe. For rows without a matching URL in the
#'   supplements dataframe, the has_supplement value will be NA. If a URL occurs
#'   more than once in the supplements dataframe, only its first row is used, so
#'   the result always has exactly as many rows as df. A message reports how many
#'   rows received a non-missing has_supplement value.
#'
#' @examples
#' # If df and supplements are your dataframes:
#' # df_updated <- merge_supplement_data(df, supplements)
#'
#' @importFrom dplyr left_join select distinct
#'
#' @export
merge_supplement_data <- function(df, supplements) {
  # Ensure the supplements dataframe has the has_supplement column
  if (!"has_supplement" %in% colnames(supplements)) {
    stop("The supplements dataframe does not contain the 'has_supplement' column")
  }

  # Ensure both dataframes have a url column
  if (!"url" %in% colnames(df) || !"url" %in% colnames(supplements)) {
    stop("Both dataframes must contain a 'url' column for joining")
  }

  # One row per URL: a repeated URL would otherwise duplicate rows of df in
  # the join and inflate the match count reported below.
  supplement_flags <- supplements |>
    dplyr::select(url, has_supplement) |>
    dplyr::distinct(url, .keep_all = TRUE)

  df_merged <- df |>
    dplyr::left_join(supplement_flags, by = "url")

  # Report how many rows were matched
  match_count <- sum(!is.na(df_merged$has_supplement))
  total_count <- nrow(df)
  # Avoid reporting NaN% for an empty input
  match_pct <- if (total_count > 0) match_count / total_count * 100 else 0

  message(sprintf("Merged has_supplement data for %d out of %d rows (%.1f%%)",
                  match_count, total_count, match_pct))

  df_merged
}

###############################################################################
# Data Cleaning
###############################################################################

# Manually coded articles
df <- read_csv("data/raw/plos_fully_coded.csv")

# Article metadata with correct text encoding (the coded raw data above has
# some encoding issues)
metadata <- read_csv("data/raw/plos_metadata.csv")

# Supplement flags retrieved via the check-supplements function (added later
# in the analysis, which required renewed scraping)
supplements <- read_rds("data/raw/added_supplements.rds")

# Attach has_supplement to the coded articles
df <- merge_supplement_data(df, supplements)

# Keep a single metadata row per article URL so the join below cannot
# duplicate rows of df
metadata <- distinct(metadata, url, .keep_all = TRUE)

# Swap the text columns for their correctly encoded metadata versions. The
# left join keeps exactly the rows of df; after it the coded values carry the
# suffix .x and the metadata values the suffix .y. coalesce() prefers the
# metadata value and falls back to the coded one where metadata is NA.
df <- df |>
  left_join(
    select(
      metadata,
      url, article_title, data_availability, funding,
      abstract, authors, affiliations
    ),
    by = "url"
  ) |>
  mutate(
    article_title = coalesce(article_title.y, article_title.x),
    data_availability = coalesce(data_availability.y, data_availability.x),
    funding = coalesce(funding.y, funding.x),
    abstract = coalesce(abstract.y, abstract.x),
    authors = coalesce(authors.y, authors.x),
    affiliations = coalesce(affiliations.y, affiliations.x)
  ) |>
  # The suffixed helper columns are no longer needed
  select(-ends_with(".x"), -ends_with(".y"))


# Overwrite method/type miscodings for specific articles
df <- clean_method_type_entries(df)

# Standardize every comma-separated methodology and data type entry: split
# each cell, trim and clean the pieces, drop the ones cleaned to NA, and join
# the rest back together
df <- df |>
  mutate(
    methodology = map_chr(
      str_split(methodology, ",\\s*"),
      \(entries) paste(
        na.omit(clean_methodology(str_trim(entries))),
        collapse = ", "
      )
    ),
    data_type = map_chr(
      str_split(data_type, ",\\s*"),
      \(entries) paste(
        na.omit(clean_data_type(str_trim(entries))),
        collapse = ", "
      )
    )
  )


###############################################################################
# Variable Generation and Recoding
###############################################################################
# First author's affiliation is the text before the first ";"; the country is
# the text after the last "," of that affiliation, standardized and then
# mapped to a continent
df <- df |>
  mutate(
    first_author_affil = str_extract(affiliations, "^[^;]+"),
    first_author_country = fix_country_names(
      str_trim(str_extract(first_author_affil, "[^,]+$"))
    ),
    first_author_continent = countrycode(
      first_author_country,
      origin = "country.name",
      destination = "continent"
    )
  )


# Dummy variables for methodology (own column from 15 mentions) and data_type
# (own column from 10 mentions); the second call builds on the first result
result_method <- create_dummy_vars(df, "methodology", "method", min_mentions = 15)
result_type <- create_dummy_vars(result_method$data, "data_type", "type", min_mentions = 10)
df <- result_type$data


# funding_binary is FALSE when funding is NA, empty, or contains
# "no specific funding" (case-insensitive), and TRUE otherwise
df <- df |>
  mutate(
    funding_binary = !(
      is.na(funding) |
        funding == "" |
        str_detect(tolower(funding), "no specific funding")
    )
  )


# Recode existing dummy variables from 1/NA to TRUE/FALSE
# NOTE(review): !is.na() turns every non-missing value into TRUE. This assumes
# has_supplement is coded 1/NA like the other flags; if it holds TRUE/FALSE,
# FALSE would become TRUE here. Confirm against the supplements data.
df <- df |>
  mutate(
    # on_request variables
    across(starts_with("on_request_"), \(flag) !is.na(flag)),
    # has supplement
    across(starts_with("has_supplement"), \(flag) !is.na(flag)),
    # data_ variables except data_type and data_availability
    across(
      starts_with("data_") & !matches("data_type|data_availability"),
      \(flag) !is.na(flag)
    )
  )

# data_combination: TRUE if the DAS combines two or more of data on request,
# data in a repository, and data in the paper and/or SI.
# doi: the DOI taken from the article URL.
df <- df |>
  mutate(
    data_combination =
      (data_on_request + data_in_repository + data_in_paper_andor_SI) >= 2,
    doi = extract_doi(url)
  )


# Add OpenAlex cited_by counts. These are read from a separate file; the code
# that fetches them through the OpenAlex API is kept below, commented out.

# df = df |> add_citation_counts()
#
#
# # Write the citation count to a separate CSV file
# citations_to_save <- df |>
#   dplyr::select(doi, cited_by_count) |> readr::write_csv("data/raw/citation_counts.csv")

df <- merge_citation_counts(df, "data/raw/citation_counts.csv")


# # Print results
# print("Methodology frequencies:")
# print(result_method$value_counts)
# print("\nData type frequencies:")
# print(result_type$value_counts)

# The index column is no longer needed
df <- dplyr::select(df, -index)
###############################################################################
# Save Cleaned Data
###############################################################################

# Write the cleaned data set to disk twice: once as CSV ...
df |> write_csv("data/analysis/plos_cleaned_data.csv")

# ... and once as RDS
df |> saveRDS(file = "data/analysis/plos_cleaned_data.rds")